MitoPerturbSeq heteroplasmy analysis

Author

Abhilesh Dhawanjewar

Published

November 11, 2025

Load and preprocess data
# Load data
mtDNA_data <- read.csv(
    here(data_dir, "integrated_dataset_metadata.csv"))
# There are 76 rows with heteroplasmy values as NAs

# Load mixscape data
mixscape_data <- read.csv(
    here(data_dir, "integrated_mixscape_metadata.csv"))

# This function predicts technical measurement variance based on mtDNA depth and is used for empirical weighting in linear models
het_sim_df <- readRDS(
    here(data_dir, "het_depth_sim.rds")
)
get_measurement_variance <- readRDS(
    here(data_dir, "measurement_variance_function.rds")
)

# Change labels for "Park2" and "Nontargeting"
mtDNA_data$top_guide_final_enrich_combined <-   mtDNA_data$top_guide_final_enrich_combined %>%
    replace(mtDNA_data$top_guide_final_enrich_combined == "Park2", "Prkn") %>%
    replace(mtDNA_data$top_guide_final_enrich_combined == "Nontargeting", "NT")

gRNA_order <- c(
        "Akap1",
        "Nnt",
        "Polg",
        "Tfam",
        "Dnm1l",
        "Mtfp1",
        "Opa1",
        "Snx9",
        "Atg5",
        "Oma1",
        "Pink1",
        "Ppargc1a",
        "Prkn",
        "Eomes",
        "Neurod1",
        "Olig1",
        "Opn4",
        "Rgr",
        "Rrh",
        "NT"
    )

# Order the factor levels
mtDNA_data$top_guide_final_enrich_combined <- factor(
    mtDNA_data$top_guide_final_enrich_combined,
    levels = gRNA_order
)

mtDNA_data_by_gRNA <- mtDNA_data %>%
    group_by(top_guide_final_enrich_combined) %>%
    dplyr::summarise(
        mean_het = mean(Heteroplasmy_all_5024, na.rm = TRUE),
        var_het = var(Heteroplasmy_all_5024, na.rm = TRUE),
        mean_depth = mean(mtDNA_depth, na.rm = TRUE),
        n_cells = n()
    ) %>%
    arrange(desc(mean_het))

# Add group labels to the gRNA genes
mtDNA_data_by_gRNA <- mtDNA_data_by_gRNA %>%
    dplyr::mutate(group = top_guide_final_enrich_combined)

mtDNA_data_by_gRNA$group <- factor(
    mtDNA_data_by_gRNA$group,
    levels = gRNA_order
)

mtDNA_data_by_gRNA <- mtDNA_data_by_gRNA %>%
    dplyr::mutate(func_group = case_when(
        group %in% c("Akap1", "Nnt", "Polg", "Tfam") ~ "Maintanence Genes",
        group %in% c("Dnm1l", "Mtfp1", "Opa1", "Snx9") ~ "Fission/Fusion",
        group %in% c("Atg5", "Oma1", "Pink1", "Ppargc1a", "Prkn") ~ "Mitophagy",
        group %in% c("Eomes", "Neurod1", "Olig1", "Opn4", "Rgr", "Rrh", "NT") ~ "Control"
    ))

# Add a grouping variable to the gRNA genes
mtDNA_data <- mtDNA_data %>%
    dplyr::mutate(group = case_when(
        top_guide_final_enrich_combined %in% c("Akap1", "Nnt", "Polg", "Tfam") ~ "Maintanence Genes",
        top_guide_final_enrich_combined %in% c("Dnm1l", "Mtfp1", "Opa1", "Snx9") ~ "Fission/Fusion",
        top_guide_final_enrich_combined %in% c("Atg5", "Oma1", "Pink1", "Ppargc1a", "Prkn") ~ "Mitophagy",
        top_guide_final_enrich_combined %in% c("Eomes", "Neurod1", "Olig1", "Opn4", "Rgr", "Rrh", "NT") ~ "Control"
    ))

mtDNA_data$group <- factor(
    mtDNA_data$group,
    levels = c("Maintanence Genes", "Fission/Fusion", "Mitophagy", "Control")
)

# Define colors for the gRNAs
col_pal <- paletteer_d("ggsci::category20_d3")
names(col_pal) <- gRNA_order

# Highlight color
custom_salmon <- saturation("#FA8072", 0.4)

Distributions of mtDNA depth and heteroplasmy

Figure 1: Ridge plots showing heteroplasmy distributions across gRNAs. TFAM, POLG, and OPA1 show broader distributions.

Applying mtDNA depth cutoff > 20

In-silico modelling of heteroplasmy calling based on sub-sampling of a simulated heteroplasmic mtDNA population suggested an increase in sampling noise at mtDNA depths < 20. However, the most dramatic perturbations (TFAM, POLG, OPA1) also exhibit significant mtDNA copy number reduction.

Figure 2: Number of cells and proportion passing mtDNA depth > 20 threshold by gRNA. Top panel shows total cells (blue) and cells passing threshold (green). Bottom panel shows proportion passing threshold. TFAM, POLG, and OPA1 show the largest drops in cells passing threshold.

The gRNAs associated with mtDNA copy number reduction also show the greatest drop of cells passing the mtDNA depth > 20 threshold.

gRNA Total Cells Cells Passing Threshold Proportion Passing
Tfam 271 50 0.18
Polg 287 103 0.36
Opa1 307 153 0.5

Applying a read depth threshold increases the accuracy of the per-cell heteroplasmy calls, which is an important consideration for downstream analyses that aim to identify subtle shifts in mean heteroplasmy. However, in the context of gene perturbations that result in a depletion of mtDNA copy number, it also risks losing genuine biological signal by excluding the cells with the most pronounced perturbations.

Heteroplasmy variance by gRNA

Figure 3: Bar plot showing the variance of heteroplasmy levels across different gRNA perturbations.

We used the Brown-Forsythe test to compare heteroplasmy variance between each gRNA and pooled Control gRNAs (Eomes, Neurod1, Olig1, Opn4, Rgr, Rrh, NT). P-values were adjusted for multiple testing using the Holm method. Significant results (p.adj.holm < 0.05) are highlighted in the table below.

Pairwise Brown-Forsythe test for heteroplasmy variance between each gRNA and Control gRNAs
mtDNA_data_non_control <- mtDNA_data %>%
    filter(group != "Control") %>%
    drop_na(Heteroplasmy_all_5024)

mtDNA_data_control <- mtDNA_data %>%
    filter(group == "Control") %>%
    drop_na(Heteroplasmy_all_5024)

non_control_gRNAs <- unique(
    mtDNA_data_non_control$top_guide_final_enrich_combined)

# Run pairwise Brown-Forsythe tests
pairwise_var_test <- map_dfr(
    non_control_gRNAs, function(gRNA) {

    gRNA_data <- mtDNA_data_non_control %>%
        filter(top_guide_final_enrich_combined == gRNA)

    pair_data <- bind_rows(
        gRNA_data %>% mutate(Group = gRNA),
        mtDNA_data_control %>% mutate(Group = "Control")
    ) %>%
        mutate(Group = factor(Group))

    bf_test <- leveneTest(
        Heteroplasmy_all_5024 ~ Group,
        data = pair_data,
        center = median
    )

    data.frame(
        gRNA = as.character(gRNA),
        p_value = bf_test$`Pr(>F)`[1]
    )
})

# Apply Holm correction
pairwise_results <- pairwise_var_test %>%
    filter(!is.na(p_value)) %>%
    mutate(
        p.adj.holm = p.adjust(p_value, method = "holm"),
        significant = p.adj.holm < 0.05
    ) %>%
    arrange(p.adj.holm)
Pairwise Brown-Forsythe Test for Heteroplasmy Variance (gRNA vs Control)
gRNA p-value p.adj (Holm)
Tfam 3.36e-80 4.37e-79
Polg 1.14e-29 1.37e-28
Opa1 1.16e-10 1.27e-09
Dnm1l 4.86e-03 4.86e-02
Nnt 2.41e-02 2.16e-01
Atg5 6.46e-02 5.17e-01
Akap1 8.18e-01 1.00e+00
Prkn 6.66e-01 1.00e+00
Oma1 3.35e-01 1.00e+00
Mtfp1 3.93e-01 1.00e+00
Pink1 5.31e-01 1.00e+00
Ppargc1a 8.78e-01 1.00e+00
Snx9 5.01e-01 1.00e+00


The gRNA KO groups for Tfam, Opa1, Polg, and Dnm1l show significantly increased heteroplasmy variance compared to Control gRNAs (Holm-adjusted p-value < 0.05).

Heteroplasmy variance by Mixscape class

Mixscape is a computational method that classifies cells as knockout (KO) or non-perturbed (NP) based on their transcriptional perturbation signature. This classification allows us to distinguish cells with successful gene knockdowns from cells that received the gRNA but show no transcriptional perturbation, providing an internal control for off-target effects within each gRNA group.

Figure 4: Heteroplasmy variance by Mixscape classification. Bar plot shows variance for knockout (KO) and non-perturbed (NP) cells within each gRNA group. KO cells generally show higher variance than their NP counterparts.
Pairwise Brown-Forsythe test for Mixscape KO vs NP comparison
# Brown-Forsythe test for each pair of KO vs NP
genes <- c("Tfam", "Opa1", "Polg", "Atg5")

pairwise_mixscape_var_test <- list()

for (gene in genes) {

    ko_label <- paste(gene, "KO")
    np_label <- paste(gene, "NP")

    ko_data <- mixscape_data %>% dplyr::filter(mixscape_class == ko_label)
    np_data <- mixscape_data %>% dplyr::filter(mixscape_class == np_label)

    pair_data <- bind_rows(
        ko_data %>% mutate(Group = "KO"),
        np_data %>% mutate(Group = "NP")
    )

    pair_data <- pair_data %>%
        mutate(Group = factor(Group))

    if (nrow(ko_data) > 0 && nrow(np_data) > 0) {
        BF_test <- car::leveneTest(
            Heteroplasmy_all_5024 ~ Group,
            data = pair_data,
            center = median
        )
        p_val <- BF_test$`Pr(>F)`[1]
    } else {
        p_val <- NA
    }

    pairwise_mixscape_var_test[[gene]] <- data.frame(
        gRNA = gene,
        p_value = p_val
    )
}

pairwise_mixscape_var_test_df <- bind_rows(pairwise_mixscape_var_test) %>%
    dplyr::filter(!is.na(p_value)) %>%
    mutate(
        p.adj.holm = p.adjust(p_value, method = "holm"),
        significant = p.adj.holm < 0.05
    ) %>%
    arrange(p.adj.holm)

pairwise_mixscape_var_test_df %>%
    mutate(
        p_value = formatC(p_value, format = "e", digits = 2),
        p.adj.holm = formatC(p.adj.holm, format = "e", digits = 2)
    ) %>%
    select(-significant) %>%
    kbl(
        format = "html",
        caption = "Pairwise Brown-Forsythe Test: KO vs NP Heteroplasmy Variance (Mixscape)",
        col.names = c("gRNA", "p-value", "p.adj (Holm)"),
        align = "c"
    ) %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
    column_spec(
        3,  # p.adj.holm column
        background = ifelse(
            as.numeric(pairwise_mixscape_var_test_df$p.adj.holm) < 0.05,
            custom_salmon,
            "white"
        )
    )
Pairwise Brown-Forsythe Test: KO vs NP Heteroplasmy Variance (Mixscape)
gRNA p-value p.adj (Holm)
Polg 4.73e-13 1.89e-12
Opa1 6.99e-10 2.10e-09
Tfam 4.12e-08 8.23e-08
Atg5 9.84e-01 9.84e-01

Heteroplasmy variance by mtDNA depth

To visualize the relationship between mtDNA coverage and heteroplasmy variance, we binned all cells into 100 quantile-based bins (equal number of cells per bin) based on mtDNA depth. For each bin, we calculated the mean mtDNA depth and heteroplasmy variance, then fit a LOESS curve to approximate the technical variance-depth relationship. Individual gRNA perturbation means (colored points) are overlaid to identify perturbations that deviate from the baseline technical trend.

Figure 5: Relationship between mtDNA depth and heteroplasmy variance. Gray points represent 100 quantile-based bins across all cells. Blue LOESS curve shows the technical variance-depth trend. Colored points show mean variance and depth for each gRNA perturbation. TFAM, POLG, and OPA1 (labeled) deviate substantially from the baseline technical trend.

Linear Modeling of Heteroplasmy Variance by gRNA and mtDNA Depth

To distinguish technical from biological effects on heteroplasmy variance, we fit a weighted linear interaction model. Since variance is a population-level metric, we model heteroplasmy deviation at the single-cell level, defined as each cell’s absolute distance from its gRNA group’s median:

\[ d_{i} = | \ h_{i} - \text{median}(h_{\text{gRNA}_i}) \ |\]

where \(h_i\) is the heteroplasmy of cell \(i\), and the median is calculated across all cells with the same gRNA perturbation.

Model specification:

\[ d_i = \beta_0 + \beta_{\text{gRNA}_i} + \beta_{\text{depth}} \cdot \text{depth}_i + \beta_{\text{gRNA}_i \times \text{depth}} \cdot \text{depth}_i + \epsilon_i\]

This model tests three effects:

  • \(\beta_0\) (intercept): Baseline heteroplasmy deviation in NT cells at depth = 0
  • \(\beta_{\text{gRNA}_i}\) (main gRNA effect): Change in heteroplasmy deviation associated with each gRNA perturbation, independent of mtDNA depth
  • \(\beta_{\text{depth}}\) (main depth effect): Change in heteroplasmy deviation associated with increasing mtDNA depth, independent of gRNA perturbation
  • \(\beta_{\text{gRNA}_i \times \text{depth}}\) (interaction effect): Modification of the depth effect for each gRNA perturbation

Weights are empirical measurement variances derived from heteroplasmy-depth simulations. We calculated weights as the inverse of measurement variance (\(w_i = 1/\sigma^2_i\)) and normalized by dividing by the maximum weight. This approach down-weights cells with low mtDNA depth, where technical noise is substantial, and up-weights cells with high mtDNA depth, where heteroplasmy estimates are more reliable.

Fit weighted linear model: deviation ~ gRNA × mtDNA_depth
# Data preparation
modeling_df <- mtDNA_data %>%
    dplyr::filter(!is.na(Heteroplasmy_all_5024), !is.na(mtDNA_depth)) %>%
    dplyr::group_by(top_guide_final_enrich_combined) %>%
    dplyr::mutate(
        median_het = median(Heteroplasmy_all_5024),
        deviation = abs(Heteroplasmy_all_5024 - median_het)
    ) %>%
    ungroup()

# Set reference level and calculate empirical weights
modeling_df$top_guide_final_enrich_combined <- relevel(
    modeling_df$top_guide_final_enrich_combined, ref = "NT"
)

modeling_df$measurement_var <- get_measurement_variance(modeling_df$mtDNA_depth)
modeling_df$weight_empirical <- 1 / modeling_df$measurement_var
modeling_df$weight_empirical <- modeling_df$weight_empirical / max(modeling_df$weight_empirical)

# Fit weighted linear model
mdl_empirical_weighted <- lm(
    deviation ~ top_guide_final_enrich_combined * mtDNA_depth,
    data = modeling_df,
    weights = weight_empirical
)
ANOVA: Heteroplasmy Variance by gRNA and mtDNA Depth
Term df Sum Sq Mean Sq F statistic p-value
gRNA 19 0.3373 0.0178 11.8198 <0.001
mtDNA_depth 1 0.2140 0.2140 142.4630 <0.001
gRNA:mtDNA_depth 19 0.1362 0.0072 4.7735 <0.001
Residuals 6355 9.5448 0.0015 NA NA

The ANOVA results demonstrate that both gRNA identity and mtDNA depth significantly influence heteroplasmy variance (both p < 0.001). Importantly, the significant interaction term (gRNA × mtDNA depth, p < 0.001) indicates that the relationship between mtDNA depth and heteroplasmy variance differs across gRNA perturbations.

Significant Effects: Heteroplasmy Variance Model (p
Type Term Estimate SE CI Lower CI Upper t-stat p-value
Biological (gRNA) Opa1 0.03436 0.00934 0.01604 0.05268 3.67698 <0.001
Biological (gRNA) Polg 0.05255 0.00988 0.03318 0.07192 5.31740 <0.001
Biological (gRNA) Tfam 0.11115 0.01037 0.09082 0.13149 10.71388 <0.001
Interaction Oma1:mtDNA_depth -0.00018 0.00009 -0.00035 -0.00002 -2.14284 0.0322
Interaction Opa1:mtDNA_depth -0.00039 0.00010 -0.00058 -0.00020 -4.05145 <0.001
Interaction Polg:mtDNA_depth -0.00043 0.00010 -0.00062 -0.00023 -4.33775 <0.001
Interaction Tfam:mtDNA_depth -0.00105 0.00015 -0.00134 -0.00076 -7.19147 <0.001

The coefficient table reveals that TFAM, POLG, and OPA1 show the strongest main effects on heteroplasmy variance, with positive coefficients indicating increased deviation from median heteroplasmy compared to non-targeting controls. The significant negative interaction terms for these genes (e.g., Tfam:mtDNA_depth) indicate that their variance-increasing effect is more pronounced at lower mtDNA depths, consistent with these perturbations causing both mtDNA depletion and increased stochasticity. This directly addresses the reviewer’s question: cells with low mtDNA coverage show higher variance, particularly in perturbations that deplete mtDNA copy number.

Fit weighted linear model for Mixscape data: deviation ~ group × mtDNA_depth
# Prepare Mixscape data
mixscape_modeling_df <- mixscape_data %>%
    filter(str_detect(mixscape_class, "Tfam|Opa1|Polg|Atg5")) %>%
    filter(!is.na(Heteroplasmy_all_5024), !is.na(mtDNA_depth)) %>%
    group_by(mixscape_class) %>%
    mutate(
        median_het = median(Heteroplasmy_all_5024),
        deviation = abs(Heteroplasmy_all_5024 - median_het)
    ) %>%
    ungroup() %>%
    mutate(
        comparison_group = if_else(
            str_detect(mixscape_class, "NP$"),
            "NP Control",
            str_replace(mixscape_class, " ", "_")
        )
    )

# Set reference and calculate weights
mixscape_modeling_df$comparison_group <- relevel(
    as.factor(mixscape_modeling_df$comparison_group),
    ref = "NP Control"
)

mixscape_modeling_df$measurement_var <- get_measurement_variance(
    mixscape_modeling_df$mtDNA_depth
)
mixscape_modeling_df$weight_empirical <- 1 / mixscape_modeling_df$measurement_var
mixscape_modeling_df$weight_empirical <- mixscape_modeling_df$weight_empirical /
    max(mixscape_modeling_df$weight_empirical, na.rm = TRUE)

# Fit model
mdl_mixscape_weighted <- lm(
    deviation ~ comparison_group * mtDNA_depth,
    data = mixscape_modeling_df,
    weights = weight_empirical
)
ANOVA: Heteroplasmy Variance by Mixscape Class and mtDNA Depth
Term df Sum Sq Mean Sq F statistic p-value
Mixscape Class 4 1.2726 0.3182 82.7193 <0.001
mtDNA_depth 1 0.1105 0.1105 28.7393 <0.001
Mixscape Class:mtDNA_depth 4 0.1795 0.0449 11.6680 <0.001
Residuals 1114 4.2847 0.0038 NA NA
Significant Effects: Mixscape Heteroplasmy Variance Model (p
Type Term Estimate SE CI Lower CI Upper t-stat p-value
Biological (Mixscape Class) Opa1_KO 0.06003 0.01493 0.03076 0.08930 4.01958 <0.001
Biological (Mixscape Class) Polg_KO 0.14102 0.01564 0.11038 0.17167 9.01925 <0.001
Biological (Mixscape Class) Tfam_KO 0.18543 0.01288 0.16018 0.21069 14.39349 <0.001
Interaction Polg_KO:mtDNA_depth -0.00236 0.00095 -0.00421 -0.00050 -2.49076 0.0129
Interaction Tfam_KO:mtDNA_depth -0.00553 0.00086 -0.00722 -0.00384 -6.40364 <0.001
Technical (Depth) mtDNA_depth -0.00050 0.00015 -0.00079 -0.00020 -3.31741 <0.001

The Mixscape linear model confirms that the observed effects are not driven by off-target artifacts. By comparing knockout (KO) cells to non-perturbed (NP) cells within the same gRNA group, we control for any gRNA-specific effects unrelated to the intended gene knockout. The significant main effects for Tfam_KO, Opa1_KO, and Polg_KO (all with positive coefficients and p < 0.001) demonstrate that cells with successful knockouts show genuinely increased heteroplasmy variance compared to cells that received the same gRNA but were not perturbed. This validates that the variance increases observed in Section 4 reflect true biological effects of mitochondrial gene perturbations rather than off-target or transfection-related artifacts.

Simulation-Based Variance Decomposition

To distinguish technical from biological sources of heteroplasmy variance, we performed binomial sampling simulations to model the expected variance arising purely from technical noise at reduced mtDNA depths.

For each gRNA perturbation, we:

  1. Sampled control cells (without replacement) to match the gRNA group size.
  2. Assigned each sampled cell a sequencing depth at the heteroplasmic sites (m.5024) drawn from the gRNA depth distribution (with replacement).
  3. Simulated alternative allele counts via binomial sampling: \(X_i \sim \text{Binomial}(n = \text{depth}_{\text{5024}, i}, \ p = \text{het}_{\text{control}, i})\)
  4. Calculated simulated heteroplasmy: \(\text{het}_{\text{sim}, i} = X_i / \text{depth}_{\text{5024}, i}\)
  5. Computed variance of simulated heteroplasmies across all cells,
  6. Repeated 5,000 times to generate a null distribution of technical variance

We tested whether observed variance exceeds the simulated technical variance using a one-tailed test:

  • \(H_0\): Observed variance ≤ Simulated technical variance (technical noise only)
  • \(H_1\): Observed variance > Simulated technical variance (biological effect present)
  • p-value: Proportion of simulations where simulated variance ≥ observed variance

A significant result (p < 0.05) indicates that the observed heteroplasmy variance cannot be fully explained by technical sampling noise at reduced coverage, providing evidence for a biological bottleneck effect.

Control group included cells with the following gRNA: Eomes, Neurod1, Olig1, Opn4, Rgr, Rrh, NT

Function to simulate heteroplasmy variance for a given gRNA perturbation
simulate_het_for_gRNA <- function(
    mtDNA_data,
    gRNA_name,
    n_perms = 5000,
    n_sim_points_to_plot = 2000,
    col_pal = NULL
) {

    # Filter KO and Control data
    KO_df <- mtDNA_data %>%
        dplyr::filter(top_guide_final_enrich_combined == gRNA_name) %>%
        dplyr::select(Heteroplasmy_all_5024, mtDNA_depth, Depth_all_5024) %>%
        dplyr::filter(
            Depth_all_5024 >= 0,
            Heteroplasmy_all_5024 >= 0,
            Heteroplasmy_all_5024 <= 1) %>%
        drop_na() %>%
        dplyr::mutate(group = gRNA_name)

    control_df <- mtDNA_data %>%
        dplyr::filter(group == "Control") %>%
        dplyr::select(Heteroplasmy_all_5024, mtDNA_depth, Depth_all_5024) %>%
        dplyr::filter(
            Depth_all_5024 >= 0,
            Heteroplasmy_all_5024 >= 0,
            Heteroplasmy_all_5024 <= 1) %>%
        drop_na() %>%
        dplyr::mutate(group = "Control")

    # Combine KO and Control data
    sim_input_df <- bind_rows(KO_df, control_df)

    # Get observed stats
    sim_input_stats <- sim_input_df %>%
        group_by(group) %>%
        dplyr::summarise(
            n_cells = n(),
            mean_depth = mean(Depth_all_5024, na.rm = TRUE),
            var_het = var(Heteroplasmy_all_5024, na.rm = TRUE)
        )

    obs_ko_var <- sim_input_stats %>% dplyr::filter(group == gRNA_name) %>% dplyr::pull(var_het)
    obs_control_var <- sim_input_stats %>% dplyr::filter(group == "Control") %>% dplyr::pull(var_het)

    all_sim_var <- numeric(n_perms)
    all_sim_hets_list <- vector("list", n_perms)
    all_input_hets_list <- vector("list", n_perms)

    # Iterate through each permutation
    for (perm in seq_len(n_perms)) {

        # Sample cells from control group equal to the number of cells in KO group
        control_sample <- sim_input_df %>%
            dplyr::filter(group == "Control") %>%
            sample_n(
                min(nrow(control_df), nrow(KO_df)),
                replace = FALSE) %>%
            dplyr::mutate(group = "Control")

        # Sample mtDNA depths from KO group
        sampled_depths <- sample(KO_df$Depth_all_5024,
                                    size = nrow(control_sample),
                                    replace = TRUE)

        # Sample heteroplasmy levels from control group
        sampled_hets <- control_sample$Heteroplasmy_all_5024
        all_input_hets_list[[perm]] <- sampled_hets

        # Simulate mtDNA copy number
        sim_alt_counts <- numeric(nrow(control_sample))
        sim_hets <- rep(NA_real_, nrow(control_sample))
        valid_indices <- which(sampled_depths > 0)

        # Simulate alt counts for valid indices
        # where sampled_depths > 0
        if (length(valid_indices) > 0) {
            sim_alt_counts[valid_indices] <- rbinom(
                n = length(valid_indices),
                size = sampled_depths[valid_indices],
                prob = sampled_hets[valid_indices]
            )
            # Calculate heteroplasmy for valid indices
            sim_hets[valid_indices] <- sim_alt_counts[valid_indices] / sampled_depths[valid_indices]
        }
        all_sim_var[perm] <- var(sim_hets, na.rm = TRUE)
        all_sim_hets_list[[perm]] <- sim_hets[!is.na(sim_hets)]
    }

    # Flatten the list of simulated heteroplasmies
    all_sim_hets_flat <- na.omit(unlist(all_sim_hets_list))

    # Observed KO stats
    obs_ko_homoplasmy_prop <- mean(KO_df$Heteroplasmy_all_5024 %in% c(0,1), na.rm = TRUE)
    obs_ko_mean_het <- mean(KO_df$Heteroplasmy_all_5024, na.rm = TRUE)

    # Simulated stats
    sim_ko_homoplasmy_prop <- mean(all_sim_hets_flat %in% c(0,1), na.rm = TRUE)
    sim_ko_mean_het <- mean(all_sim_hets_flat, na.rm = TRUE)
    sim_ko_var <- var(all_sim_hets_flat, na.rm = TRUE)

    # Empirical p-value (one-tailed): proportion of simulated variances >= observed KO variance
    p_val <- mean(all_sim_var >= obs_ko_var) # (simulated variance >= observed variance)

    plot_df_hets <- data.frame(
        Heteroplasmy = c(
            control_df$Heteroplasmy_all_5024,
            all_sim_hets_flat,
            KO_df$Heteroplasmy_all_5024
        ),
        Group = factor(c(
            rep("Control", nrow(control_df)),
            rep("Simulated", length(all_sim_hets_flat)),
            rep(gRNA_name, nrow(KO_df))
        ), levels = c("Control", "Simulated", gRNA_name))
    )

    # Subsample points to avoid cluttering
    # Set the number of points to plot for the jitter layer
    n_sim_points <- min(length(all_sim_hets_flat), n_sim_points_to_plot)

    # Filter the main dataframe, subsample only the 'Simulated' group
    jitter_data <- plot_df_hets %>%
        group_by(Group) %>%
        # If the group is 'Simulated', sample n_sim_points, otherwise take all rows
        dplyr::filter(
            if (cur_group()$Group == "Simulated") row_number() %in% sample(row_number(), n_sim_points) else TRUE
        ) %>%
        ungroup()

    # Set colors for Control, Simulated, and gRNA
    if (!is.null(col_pal)) {
        group_colors <- c(
            "Control" = scales::alpha(shades::saturation(col_pal["NT"], 0.8), 0.6),
            "Simulated" = scales::alpha(shades::saturation(col_pal[gRNA_name], 0.4), 0.6),
            gRNA_name = scales::alpha(shades::saturation(col_pal[gRNA_name], 0.8), 0.6)
        )
        names(group_colors)[3] <- gRNA_name
    } else {
        group_colors <- c(
            "Control" = "#2CA02CFF",
            "Simulated" = "#1F77B4FF",
            gRNA_name = "#D62728FF"
        )
        names(group_colors)[3] <- gRNA_name
    }

    het_dist_sim <- ggplot(
        plot_df_hets,
        aes(x = Group, y = Heteroplasmy, fill = Group)
    ) +
        geom_violin(
            data = jitter_data,
            width = 1.2,
            alpha = 0.3,
            color = NA
        ) +
        geom_jitter(
            data = jitter_data,
            width = 0.15,
            size = 1.2,
            alpha = 0.3,
            aes(color = Group)
        ) +
        geom_boxplot(
            data = jitter_data,
            width = 0.2,
            outlier.shape = NA,
            alpha = 0.6,
            position = "identity"
        ) +
        scale_fill_manual(values = group_colors) +
        scale_color_manual(values = group_colors) +
        theme_minimal(base_size = 14) +
        labs(
            title = paste("Distribution of Heteroplasmies (", gRNA_name, " KO Simulation)", sep = ""),
            x = NULL,
            y = "Heteroplasmy (m.5024)"
        ) +
        theme(legend.position = "none")

    return(list(
        plot = het_dist_sim,
        sim_input_stats = sim_input_stats,
        all_sim_var = all_sim_var,
        all_sim_hets_list = all_sim_hets_list,
        obs_ko_mean_het = obs_ko_mean_het,
        obs_ko_var = obs_ko_var,
        obs_ko_homoplasmy_prop = obs_ko_homoplasmy_prop,
        sim_ko_mean_het = sim_ko_mean_het,
        sim_ko_var = sim_ko_var,
        sim_ko_homoplasmy_prop = sim_ko_homoplasmy_prop,
        p_value= p_val
    ))
}
Summary of Observed and Simulated Heteroplasmy Metrics
gRNA
Mean Heteroplasmy
Heteroplasmy Variance
Homoplasmy Proportion
p-value
Akap1 0.5880217 0.5776528 0.0126753 0.0153647 0.0027100 0.0054537 0.9816
Nnt 0.5915305 0.5775939 0.0154287 0.0164833 0.0068493 0.0096616 0.7602
Polg 0.5810198 0.5778270 0.0408603 0.0379683 0.0921986 0.0822156 0.2362
Tfam 0.5928463 0.5772007 0.0700025 0.0597565 0.1889764 0.1712425 0.0372
Dnm1l 0.5776302 0.5776180 0.0158225 0.0153254 0.0049628 0.0059097 0.3440
Mtfp1 0.5854568 0.5776063 0.0114118 0.0146212 0.0000000 0.0038865 0.9940
Opa1 0.5980599 0.5777288 0.0252051 0.0199634 0.0163399 0.0157373 0.0128
Snx9 0.5909775 0.5774928 0.0139536 0.0156394 0.0035461 0.0070794 0.8538
Atg5 0.5902907 0.5777904 0.0148085 0.0152226 0.0032051 0.0060244 0.5912
Oma1 0.5824195 0.5775588 0.0155447 0.0167433 0.0132626 0.0129263 0.7528
Pink1 0.5771050 0.5777252 0.0128299 0.0156186 0.0042644 0.0070554 0.9918
Ppargc1a 0.5822394 0.5777009 0.0127014 0.0153192 0.0063291 0.0059184 0.9668
Prkn 0.5793903 0.5776200 0.0136172 0.0153763 0.0056818 0.0067477 0.8984
Eomes 0.5688203 0.5776052 0.0151203 0.0157932 0.0042017 0.0080513 0.6134
Neurod1 0.5724236 0.5776236 0.0109754 0.0145426 0.0000000 0.0032442 0.9980
Olig1 0.5817256 0.5775248 0.0117112 0.0147911 0.0000000 0.0037015 0.9858
Opn4 0.5798140 0.5776032 0.0122437 0.0150643 0.0034130 0.0057590 0.9752
Rgr 0.5720849 0.5776398 0.0144841 0.0162799 0.0065789 0.0082730 0.8616
Rrh 0.5795061 0.5775674 0.0132620 0.0165454 0.0080000 0.0113448 0.9586
NT 0.5883744 0.5776483 0.0132568 0.0150385 0.0000000 0.0049896 0.8900

Only Tfam and Opa1 gRNAs show a significantly higher heteroplasmy variance compared to the simulated variance from the Control group, suggesting that these gRNAs may have a true biological effect on heteroplasmy variance. The Polg gRNA does not show a significant difference, indicating that its effect on heteroplasmy variance may be similar to that of the Control group.

Figure 6: Comparison of observed and simulated heteroplasmy distributions for TFAM, OPA1, and POLG perturbations. Violin plots show the distribution of heteroplasmy values at m.5024. ‘Observed’ distributions represent actual KO cells. ‘Simulated’ distributions represent technical variance expected from binomial sampling of control cells at KO depth distributions. TFAM and OPA1 show broader observed distributions than simulated, indicating biological variance exceeds technical noise.
Figure 7: Distribution of simulated heteroplasmy variance for TFAM, POLG, and OPA1. Histograms and density curves show 5,000 bootstrap simulations of technical variance for each gRNA. Dashed vertical lines indicate observed variance in KO cells (colored) and control cells (gray). One-tailed p-values test whether observed variance exceeds simulated technical variance. Only TFAM (p < 0.05) and OPA1 (p < 0.05) show significant excess variance, while POLG does not (p > 0.05).

P-values in the above plot are one-tailed, testing the null hypothesis that the observed heteroplasmy variance in the gRNA KO group is not significantly greater than the distribution of simulated variances derived from the Control group.

Heteroplasmy Variance for Significant gRNAs (p < 0.05)
Heteroplasmy Variance
gRNA Observed Technical Biological
Tfam 0.0700 0.0598 0.0102
Opa1 0.0252 0.0200 0.0052
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS 26.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/London
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] car_3.1-3           carData_3.0-5       kableExtra_1.4.0.19
 [4] knitr_1.50          paletteer_1.6.0     plotly_4.11.0      
 [7] shades_1.4.0        scales_1.4.0        ggrepel_0.9.6      
[10] ggridges_0.5.6      patchwork_1.3.1     broom_1.0.9        
[13] lubridate_1.9.4     forcats_1.0.0       dplyr_1.1.4        
[16] purrr_1.1.0         readr_2.1.5         tidyr_1.3.1        
[19] tibble_3.3.0        ggplot2_3.5.2       tidyverse_2.0.0    
[22] stringr_1.5.1       here_1.0.1          conflicted_1.2.0   

loaded via a namespace (and not attached):
 [1] gtable_0.3.6       xfun_0.52          htmlwidgets_1.6.4  lattice_0.22-6    
 [5] tzdb_0.5.0         crosstalk_1.2.1    vctrs_0.6.5        tools_4.4.2       
 [9] generics_0.1.4     pkgconfig_2.0.3    Matrix_1.7-1       data.table_1.17.8 
[13] RColorBrewer_1.1-3 lifecycle_1.0.4    compiler_4.4.2     farver_2.1.2      
[17] textshaping_1.0.1  htmltools_0.5.8.1  yaml_2.3.10        lazyeval_0.2.2    
[21] Formula_1.2-5      pillar_1.11.0      cachem_1.1.0       abind_1.4-8       
[25] nlme_3.1-168       tidyselect_1.2.1   digest_0.6.37      stringi_1.8.7     
[29] rematch2_2.1.2     splines_4.4.2      labeling_0.4.3     rprojroot_2.1.0   
[33] fastmap_1.2.0      grid_4.4.2         cli_3.6.5          magrittr_2.0.3    
[37] withr_3.0.2        backports_1.5.0    timechange_0.3.0   rmarkdown_2.29    
[41] httr_1.4.7         ragg_1.4.0         hms_1.1.3          memoise_2.0.1     
[45] evaluate_1.0.4     viridisLite_0.4.2  mgcv_1.9-3         rlang_1.1.6       
[49] Rcpp_1.1.0         glue_1.8.0         xml2_1.3.8         svglite_2.2.1     
[53] rstudioapi_0.17.1  jsonlite_2.0.0     R6_2.6.1           prismatic_1.1.2   
[57] systemfonts_1.2.3